{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/ljj/anaconda/envs/python3env/lib/python3.6/site-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n",
      "  \"This module will be removed in 0.20.\", DeprecationWarning)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "height has been deprecated.\n",
      "\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    " \n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn import preprocessing\n",
    "from sklearn.ensemble import RandomForestRegressor\n",
    "from sklearn.pipeline import make_pipeline\n",
    "from sklearn.model_selection import GridSearchCV\n",
    "from sklearn.metrics import mean_squared_error, r2_score\n",
    "from sklearn.externals import joblib \n",
    "from sklearn.ensemble import GradientBoostingRegressor\n",
    "from sklearn.feature_selection import SelectFromModel\n",
    "from sklearn.decomposition import PCA\n",
    "from sklearn.cross_validation import cross_val_score\n",
    "from sklearn.model_selection import KFold\n",
    "from sklearn.linear_model import Ridge\n",
    "from sklearn.linear_model import RidgeCV\n",
    "from sklearn.linear_model import LinearRegression\n",
    "from sklearn.linear_model import Lasso\n",
    "from sklearn.svm import SVR\n",
    "from sklearn.utils import shuffle\n",
    "import matplotlib.pyplot as plt\n",
    "import xgboost as xgb\n",
    "from lightgbm import LGBMRegressor\n",
    "import math\n",
    "%matplotlib inline\n",
    "\n",
    "pd.set_option('display.max_colwidth',1000)\n",
    "pd.set_option('display.height',1000)\n",
    "pd.set_option('display.max_rows',500)\n",
    "pd.set_option('display.max_columns',500)\n",
    "pd.set_option('display.width',1000)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "-----------------"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 数据预处理"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 读取数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "train_data = pd.read_csv('./public.train.csv')\n",
    "test_data = pd.read_csv('./public.test.csv')\n",
    "\n",
    "df_result = pd.DataFrame()\n",
    "df_result['ID'] = list(test_data['ID'])\n",
    "special_missing_ID = test_data[test_data[(test_data == 0) | (test_data == 0.)].count(axis=1) > 13]['ID']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 异常值处理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def drop_all_outlier(df):\n",
    "    df.drop_duplicates(df.columns.drop('ID'), keep='first', inplace=True)\n",
    "    df.drop(df[(df.电压A > 800) | (df.电压A < 500)].index,inplace=True)\n",
    "    df.drop(df[(df.电压B > 800) | (df.电压B < 500)].index,inplace=True)\n",
    "    df.drop(df[(df.电压C > 800) | (df.电压C < 500)].index,inplace=True)\n",
    "    df.drop(df[(df.现场温度 > 30) | (df.现场温度 < -30)].index,inplace=True)\n",
    "    df.drop(df[(df.转换效率A > 100)].index,inplace=True)\n",
    "    df.drop(df[(df.转换效率B > 100)].index,inplace=True)\n",
    "    df.drop(df[(df.转换效率C > 100)].index,inplace=True)\n",
    "    df.drop(df[(df.风向 > 360)].index,inplace=True)\n",
    "    df.drop(df[(df.风速 > 20)].index,inplace=True)\n",
    "    return df\n",
    "\n",
    "cleaned_train_data = train_data.copy()\n",
    "cleaned_train_data = drop_all_outlier(cleaned_train_data)\n",
    "\n",
    "cleaned_sub_data = test_data.copy()\n",
    "cleaned_sub_data = drop_all_outlier(cleaned_sub_data)\n",
    "cleaned_sub_data_ID = cleaned_sub_data['ID']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "53"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "all_data = pd.concat([train_data, test_data], axis=0).sort_values(by='ID').reset_index().drop(['index'], axis=1)\n",
    "bad_feature = ['ID', '功率A', '功率B', '功率C', '平均功率', '现场温度', '电压A', '电压B', '电压C', '电流B', '电流C', '转换效率', '转换效率A', '转换效率B', '转换效率C']\n",
    "bad_index = all_data[bad_feature][\n",
    "    (all_data[bad_feature] > all_data[bad_feature].mean() + 2 * all_data[bad_feature].std()) | \n",
    "    (all_data[bad_feature] < all_data[bad_feature].mean() - 2 * all_data[bad_feature].std())\n",
    "].dropna(how='all').index\n",
    "\n",
    "nn_bad_data = all_data.loc[np.concatenate([bad_index - 1, bad_index, bad_index + 1])].sort_values(by='ID', ascending=True).drop_duplicates()\n",
    "bad_data = all_data.loc[bad_index].sort_values(by='ID', ascending=True)\n",
    "len(bad_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 上下记录均值替代异常值\n",
    "for idx, line in bad_data.iterrows():\n",
    "    ID = line['ID']\n",
    "    col_index = line[bad_feature][ \n",
    "        (line[bad_feature] > all_data[bad_feature].mean() + 2 * all_data[bad_feature].std())| \n",
    "        (line[bad_feature] < all_data[bad_feature].mean() - 2 * all_data[bad_feature].std())\n",
    "    ].index\n",
    "    index = all_data[all_data['ID'] == ID].index\n",
    "    \n",
    "    before_offset = 1\n",
    "    while (idx + before_offset)in bad_index:\n",
    "        before_offset += 1\n",
    "\n",
    "    after_offset = 1\n",
    "    while (idx + after_offset) in bad_index:\n",
    "        after_offset += 1\n",
    "    \n",
    "    replace_value = (all_data.loc[index - before_offset, col_index].values + all_data.loc[index + after_offset, col_index].values) / 2\n",
    "    all_data.loc[index, col_index] = replace_value[0]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 拆分数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(9000, 8409)"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_data = all_data.drop(all_data[all_data['ID'].isin(df_result['ID'])].index).reset_index().drop(['index'], axis=1)\n",
    "test_data = all_data[all_data['ID'].isin(df_result['ID'])].drop(['发电量'], axis=1).reset_index().drop(['index'], axis=1)\n",
    "len(train_data), len(test_data)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 去除重复值"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "train_data = train_data.drop_duplicates(train_data.columns.drop('ID'), keep='first')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 生成数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def generate_train_data(train_data, test_data, poly=False, select=False):\n",
    "    y = train_data['发电量']\n",
    "    X = train_data.drop(['发电量','ID'], axis=1)\n",
    "    sub_data = test_data.drop(['ID'], axis=1)\n",
    "    \n",
    "    polynm = None\n",
    "    if poly:\n",
    "        from sklearn.preprocessing import PolynomialFeatures\n",
    "        polynm = PolynomialFeatures(degree=2, interaction_only=True)\n",
    "        X = polynm.fit_transform(X)\n",
    "        sub_data = polynm.transform(sub_data)\n",
    "        \n",
    "    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)\n",
    "    \n",
    "    sm = None\n",
    "    if select:\n",
    "        from sklearn.feature_selection import SelectFromModel\n",
    "        sm = SelectFromModel(GradientBoostingRegressor(random_state=2))\n",
    "        X_train = sm.fit_transform(X_train, y_train)\n",
    "        X_test = sm.transform(X_test)\n",
    "        sub_data = sm.transform(sub_data)\n",
    "        \n",
    "    return X_train, X_test, y_train, y_test, sub_data, sm, polynm\n",
    "\n",
    "def cal_score(mse):\n",
    "    if isinstance(mse, float):\n",
    "        return 1 / (1 + math.sqrt(mse))\n",
    "    else:\n",
    "        return np.divide(1, 1 + np.sqrt(mse))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "X_train, X_test, y_train, y_test, sub_data, sm, polynm = generate_train_data(train_data, test_data, poly=True, select=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "clean_X_train, clean_X_test, clean_y_train, clean_y_test, clean_sub_data, _, _ = generate_train_data(cleaned_train_data, cleaned_sub_data, poly=False, select=False)\n",
    "\n",
    "clean_X = np.concatenate([clean_X_train, clean_X_test])\n",
    "clean_y = np.concatenate([clean_y_train, clean_y_test])\n",
    "clean_X = polynm.transform(clean_X)\n",
    "clean_X = sm.transform(clean_X)\n",
    "\n",
    "clean_sub_data = polynm.transform(clean_sub_data)\n",
    "clean_sub_data = sm.transform(clean_sub_data)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "--------"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Stacking Model"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Tree Models"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "all_X_train = np.concatenate([X_train, X_test])\n",
    "all_y_train = np.concatenate([y_train, y_test])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def cross_validation_test(models, train_X_data, train_y_data, cv=5):\n",
    "    model_name, mse_avg, score_avg = [], [], []\n",
    "    for i, model in enumerate(models):\n",
    "        print(i + 1,'- Model:', str(model).split('(')[0])\n",
    "        model_name.append(str(i + 1) + '.' + str(model).split('(')[0])\n",
    "        nmse = cross_val_score(model, train_X_data[i], train_y_data[i], cv=cv, scoring='neg_mean_squared_error')\n",
    "        avg_mse = np.average(-nmse)\n",
    "        scores = cal_score(-nmse)\n",
    "        avg_score = np.average(scores)\n",
    "        mse_avg.append(avg_mse)\n",
    "        score_avg.append(avg_score)\n",
    "        print('MSE:', -nmse)\n",
    "        print('Score:', scores)\n",
    "        print('Average XGB - MSE:', avg_mse, ' - Score:', avg_score, '\\n')\n",
    "    res = pd.DataFrame()\n",
    "    res['Model'] = model_name\n",
    "    res['Avg MSE'] = mse_avg\n",
    "    res['Avg Score'] = score_avg\n",
    "    return res"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "xgbt1 = xgb.XGBRegressor(n_estimators=950, max_depth=3, max_features='sqrt', random_state=2, n_jobs=8)\n",
    "xgbt2 = xgb.XGBRegressor(n_estimators=1000, max_depth=3, max_features='sqrt', random_state=3, n_jobs=8)\n",
    "xgbt3 = xgb.XGBRegressor(n_estimators=1100, max_depth=3, max_features='sqrt', random_state=4, n_jobs=8)\n",
    "\n",
    "gbdt1 = GradientBoostingRegressor(n_estimators=500, max_depth=3, max_features='sqrt', random_state=2)\n",
    "gbdt2 = GradientBoostingRegressor(n_estimators=400, max_depth=3, max_features='sqrt', random_state=3)\n",
    "gbdt3 = GradientBoostingRegressor(n_estimators=500, max_depth=4, max_features='log2', random_state=4)\n",
    "\n",
    "forest1 = RandomForestRegressor(n_estimators=300, max_features='sqrt', random_state=2, n_jobs=8)\n",
    "forest2 = RandomForestRegressor(n_estimators=300, max_features='log2', random_state=3, n_jobs=8)\n",
    "forest3 = RandomForestRegressor(n_estimators=600, max_features='sqrt', random_state=4, n_jobs=8) \n",
    "\n",
    "lgb1 = LGBMRegressor(n_estimators=900, max_depth=5, random_state=2, n_jobs=8) \n",
    "lgb2 = LGBMRegressor(n_estimators=850, max_depth=4, random_state=3, n_jobs=8)\n",
    "lgb3 = LGBMRegressor(n_estimators=720, max_depth=4, random_state=4, n_jobs=8)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1 - Model: XGBRegressor\n",
      "MSE: [0.01293621 0.02728273 0.0141983  0.08662671 0.01538725]\n",
      "Score: [0.89787768 0.85824029 0.89352999 0.77260399 0.88964381]\n",
      "Average XGB - MSE: 0.031286239207819554  - Score: 0.8623791511542953 \n",
      "\n",
      "2 - Model: XGBRegressor\n",
      "MSE: [0.0128769  0.02723475 0.01410293 0.08655349 0.01539099]\n",
      "Score: [0.89808815 0.85834732 0.89385014 0.77267825 0.88963189]\n",
      "Average XGB - MSE: 0.031231813908194888  - Score: 0.8625191520279886 \n",
      "\n",
      "3 - Model: XGBRegressor\n",
      "MSE: [0.01284835 0.02716063 0.01401689 0.08649669 0.01530766]\n",
      "Score: [0.89818971 0.85851292 0.89414012 0.7727359  0.88989812]\n",
      "Average XGB - MSE: 0.03116604391867061  - Score: 0.8626953542814435 \n",
      "\n",
      "4 - Model: GradientBoostingRegressor\n",
      "MSE: [0.01312464 0.02290779 0.01361585 0.08404209 0.01823618]\n",
      "Score: [0.89721279 0.86854321 0.89550609 0.77525381 0.88102513]\n",
      "Average XGB - MSE: 0.030385310410269484  - Score: 0.8635082063327202 \n",
      "\n",
      "5 - Model: GradientBoostingRegressor\n",
      "MSE: [0.01465626 0.02596406 0.01705141 0.08444769 0.01734632]\n",
      "Score: [0.89201049 0.86122728 0.88450095 0.7748341  0.88362217]\n",
      "Average XGB - MSE: 0.031893147167612504  - Score: 0.8592390001025452 \n",
      "\n",
      "6 - Model: GradientBoostingRegressor\n",
      "MSE: [0.01035818 0.02108569 0.01321951 0.08394604 0.01557163]\n",
      "Score: [0.90762619 0.87320295 0.8968802  0.77535342 0.88905773]\n",
      "Average XGB - MSE: 0.02883621224382915  - Score: 0.8684240984313913 \n",
      "\n",
      "7 - Model: RandomForestRegressor\n",
      "MSE: [0.01235815 0.02196859 0.01382578 0.08761554 0.01590157]\n",
      "Score: [0.89995456 0.87091473 0.89478807 0.7716054  0.88801949]\n",
      "Average XGB - MSE: 0.03033392435530686  - Score: 0.8650564497300948 \n",
      "\n",
      "8 - Model: RandomForestRegressor\n",
      "MSE: [0.01247722 0.02228493 0.01383948 0.08720759 0.01571226]\n",
      "Score: [0.89952206 0.87010894 0.89474144 0.77201638 0.88861358]\n",
      "Average XGB - MSE: 0.030304294479981016  - Score: 0.8650004806517322 \n",
      "\n",
      "9 - Model: RandomForestRegressor\n",
      "MSE: [0.01214584 0.02203634 0.01372344 0.0878307  0.01628677]\n",
      "Score: [0.90073199 0.87074153 0.89513728 0.77138921 0.88682388]\n",
      "Average XGB - MSE: 0.030404616021233927  - Score: 0.8649647786090968 \n",
      "\n",
      "10 - Model: LGBMRegressor\n",
      "MSE: [0.01143217 0.02367465 0.01179238 0.08493677 0.01574635]\n",
      "Score: [0.90340658 0.866652   0.90204453 0.77432995 0.88850626]\n",
      "Average XGB - MSE: 0.029516463695854776  - Score: 0.8669878641222493 \n",
      "\n",
      "11 - Model: LGBMRegressor\n",
      "MSE: [0.01154947 0.02329073 0.01239651 0.08416621 0.01531062]\n",
      "Score: [0.90296025 0.86759388 0.89981494 0.77512522 0.88988867]\n",
      "Average XGB - MSE: 0.029342709054010437  - Score: 0.8670765908945898 \n",
      "\n",
      "12 - Model: LGBMRegressor\n",
      "MSE: [0.01163493 0.0230706  0.01239519 0.08436435 0.01522609]\n",
      "Score: [0.90263678 0.86813838 0.89981973 0.77492022 0.89015961]\n",
      "Average XGB - MSE: 0.02933823395688176  - Score: 0.8671349439414193 \n",
      "\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Model</th>\n",
       "      <th>Avg MSE</th>\n",
       "      <th>Avg Score</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1.XGBRegressor</td>\n",
       "      <td>0.031286</td>\n",
       "      <td>0.862379</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2.XGBRegressor</td>\n",
       "      <td>0.031232</td>\n",
       "      <td>0.862519</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3.XGBRegressor</td>\n",
       "      <td>0.031166</td>\n",
       "      <td>0.862695</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4.GradientBoostingRegressor</td>\n",
       "      <td>0.030385</td>\n",
       "      <td>0.863508</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5.GradientBoostingRegressor</td>\n",
       "      <td>0.031893</td>\n",
       "      <td>0.859239</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>6.GradientBoostingRegressor</td>\n",
       "      <td>0.028836</td>\n",
       "      <td>0.868424</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>7.RandomForestRegressor</td>\n",
       "      <td>0.030334</td>\n",
       "      <td>0.865056</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>8.RandomForestRegressor</td>\n",
       "      <td>0.030304</td>\n",
       "      <td>0.865000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>9.RandomForestRegressor</td>\n",
       "      <td>0.030405</td>\n",
       "      <td>0.864965</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>10.LGBMRegressor</td>\n",
       "      <td>0.029516</td>\n",
       "      <td>0.866988</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>11.LGBMRegressor</td>\n",
       "      <td>0.029343</td>\n",
       "      <td>0.867077</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>12.LGBMRegressor</td>\n",
       "      <td>0.029338</td>\n",
       "      <td>0.867135</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                          Model   Avg MSE  Avg Score\n",
       "0                1.XGBRegressor  0.031286   0.862379\n",
       "1                2.XGBRegressor  0.031232   0.862519\n",
       "2                3.XGBRegressor  0.031166   0.862695\n",
       "3   4.GradientBoostingRegressor  0.030385   0.863508\n",
       "4   5.GradientBoostingRegressor  0.031893   0.859239\n",
       "5   6.GradientBoostingRegressor  0.028836   0.868424\n",
       "6       7.RandomForestRegressor  0.030334   0.865056\n",
       "7       8.RandomForestRegressor  0.030304   0.865000\n",
       "8       9.RandomForestRegressor  0.030405   0.864965\n",
       "9              10.LGBMRegressor  0.029516   0.866988\n",
       "10             11.LGBMRegressor  0.029343   0.867077\n",
       "11             12.LGBMRegressor  0.029338   0.867135"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cross_validation_test(\n",
    "    models=[    \n",
    "        xgbt1, xgbt2, xgbt3,\n",
    "        gbdt1, gbdt2, gbdt3,\n",
    "        forest1, forest2, forest3,\n",
    "        lgb1, lgb2, lgb3\n",
    "    ],\n",
    "    train_X_data=[\n",
    "        all_X_train, all_X_train, all_X_train, all_X_train,\n",
    "        all_X_train, all_X_train, all_X_train, all_X_train,\n",
    "        all_X_train, all_X_train, all_X_train, all_X_train\n",
    "    ],\n",
    "    train_y_data=[\n",
    "        all_y_train, all_y_train, all_y_train, all_y_train,\n",
    "        all_y_train, all_y_train, all_y_train, all_y_train,\n",
    "        all_y_train, all_y_train, all_y_train, all_y_train\n",
    "    ]\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### KNN models"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.neighbors import KNeighborsRegressor\n",
    "\n",
    "knn1 = KNeighborsRegressor(n_neighbors=7, p=1)\n",
    "knn2 = KNeighborsRegressor(n_neighbors=8, p=2)\n",
    "knn3 = KNeighborsRegressor(n_neighbors=6, p=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1 - Model: KNeighborsRegressor\n",
      "MSE: [0.01765135 0.02923458 0.02315944 0.09964368 0.02050656]\n",
      "Score: [0.88272285 0.85398466 0.86791824 0.76007256 0.87473673]\n",
      "Average XGB - MSE: 0.03803912123551127  - Score: 0.8478870089577384 \n",
      "\n",
      "2 - Model: KNeighborsRegressor\n",
      "MSE: [0.01837335 0.03597406 0.02861069 0.10066955 0.02254798]\n",
      "Score: [0.88063183 0.84057043 0.85532447 0.75913737 0.86944436]\n",
      "Average XGB - MSE: 0.041235127004732156  - Score: 0.8410216935327213 \n",
      "\n",
      "3 - Model: KNeighborsRegressor\n",
      "MSE: [0.01788141 0.02886297 0.02147256 0.09904605 0.02112333]\n",
      "Score: [0.88205092 0.85478044 0.87219303 0.76062065 0.87310419]\n",
      "Average XGB - MSE: 0.03767726548728545  - Score: 0.8485498462193644 \n",
      "\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Model</th>\n",
       "      <th>Avg MSE</th>\n",
       "      <th>Avg Score</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1.KNeighborsRegressor</td>\n",
       "      <td>0.038039</td>\n",
       "      <td>0.847887</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2.KNeighborsRegressor</td>\n",
       "      <td>0.041235</td>\n",
       "      <td>0.841022</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3.KNeighborsRegressor</td>\n",
       "      <td>0.037677</td>\n",
       "      <td>0.848550</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                   Model   Avg MSE  Avg Score\n",
       "0  1.KNeighborsRegressor  0.038039   0.847887\n",
       "1  2.KNeighborsRegressor  0.041235   0.841022\n",
       "2  3.KNeighborsRegressor  0.037677   0.848550"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cross_validation_test(\n",
    "    models=[knn1, knn2, knn3],\n",
    "    train_X_data=[\n",
    "        all_X_train, all_X_train, all_X_train, \n",
    "    ],\n",
    "    train_y_data=[\n",
    "        all_y_train, all_y_train, all_y_train,\n",
    "    ]\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Stacking"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "regrs = [\n",
    "    xgbt1, gbdt1, forest1, lgb1, knn1, \n",
    "    xgbt2, gbdt2, forest2, lgb2, knn2, \n",
    "    xgbt3, gbdt3, forest3, lgb3, knn3\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "class Stacker(object):\n",
    "    def __init__(self, n_splits, stacker, base_models):\n",
    "        self.n_splits = n_splits\n",
    "        self.stacker = stacker\n",
    "        self.base_models = base_models\n",
    "    \n",
    "    # X: 原始训练集, y: 原始训练集真实值, predict_data: 原始待预测数据\n",
    "    def fit_predict(self, X, y, predict_data):\n",
    "        X = np.array(X)\n",
    "        y = np.array(y)\n",
    "        T = np.array(predict_data)\n",
    "\n",
    "        folds = list(KFold(n_splits=self.n_splits, shuffle=False, random_state=2018).split(X, y))\n",
    "        \n",
    "        # 以基学习器预测结果为特征的 stacker的训练数据 与 stacker预测数据\n",
    "        S_train = np.zeros((X.shape[0], len(self.base_models)))\n",
    "        S_predict = np.zeros((T.shape[0], len(self.base_models)))\n",
    "        \n",
    "        for i, regr in enumerate(self.base_models):\n",
    "            print(i + 1, 'Base model:', str(regr).split('(')[0])\n",
    "            S_predict_i = np.zeros((T.shape[0], self.n_splits))\n",
    "            \n",
    "            for j, (train_idx, test_idx) in enumerate(folds):\n",
    "                # 将X分为训练集与测试集\n",
    "                X_train, y_train, X_test, y_test = X[train_idx], y[train_idx], X[test_idx], y[test_idx]\n",
    "                print ('Fit fold', (j+1), '...')\n",
    "                regr.fit(X_train, y_train)\n",
    "                y_pred = regr.predict(X_test)                \n",
    "                S_train[test_idx, i] = y_pred\n",
    "                S_predict_i[:, j] = regr.predict(T)\n",
    "            \n",
    "            S_predict[:, i] = S_predict_i.mean(axis=1)\n",
    "\n",
    "        nmse_score = cross_val_score(self.stacker, S_train, y, cv=5, scoring='neg_mean_squared_error')\n",
    "        print('CV MSE:', -nmse_score)\n",
    "        print('Stacker AVG MSE:', -nmse_score.mean(), 'Stacker AVG Score:', np.mean(np.divide(1, 1 + np.sqrt(-nmse_score))))\n",
    "\n",
    "        self.stacker.fit(S_train, y)\n",
    "        res = self.stacker.predict(S_predict)\n",
    "        return res, S_train, S_predict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# stacking_mode1 = Ridge(alpha=0.008, copy_X=True, fit_intercept=False, solver='auto', random_state=2)\n",
    "stacking_model = SVR(C=100, gamma=0.01, epsilon=0.01)\n",
    "stacker = Stacker(5, stacking_model, regrs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1 Base model: XGBRegressor\n",
      "Fit fold 1 ...\n",
      "Fit fold 2 ...\n",
      "Fit fold 3 ...\n",
      "Fit fold 4 ...\n",
      "Fit fold 5 ...\n",
      "2 Base model: GradientBoostingRegressor\n",
      "Fit fold 1 ...\n",
      "Fit fold 2 ...\n",
      "Fit fold 3 ...\n",
      "Fit fold 4 ...\n",
      "Fit fold 5 ...\n",
      "3 Base model: RandomForestRegressor\n",
      "Fit fold 1 ...\n",
      "Fit fold 2 ...\n",
      "Fit fold 3 ...\n",
      "Fit fold 4 ...\n",
      "Fit fold 5 ...\n",
      "4 Base model: LGBMRegressor\n",
      "Fit fold 1 ...\n",
      "Fit fold 2 ...\n",
      "Fit fold 3 ...\n",
      "Fit fold 4 ...\n",
      "Fit fold 5 ...\n",
      "5 Base model: KNeighborsRegressor\n",
      "Fit fold 1 ...\n",
      "Fit fold 2 ...\n",
      "Fit fold 3 ...\n",
      "Fit fold 4 ...\n",
      "Fit fold 5 ...\n",
      "6 Base model: XGBRegressor\n",
      "Fit fold 1 ...\n",
      "Fit fold 2 ...\n",
      "Fit fold 3 ...\n",
      "Fit fold 4 ...\n",
      "Fit fold 5 ...\n",
      "7 Base model: GradientBoostingRegressor\n",
      "Fit fold 1 ...\n",
      "Fit fold 2 ...\n",
      "Fit fold 3 ...\n",
      "Fit fold 4 ...\n",
      "Fit fold 5 ...\n",
      "8 Base model: RandomForestRegressor\n",
      "Fit fold 1 ...\n",
      "Fit fold 2 ...\n",
      "Fit fold 3 ...\n",
      "Fit fold 4 ...\n",
      "Fit fold 5 ...\n",
      "9 Base model: LGBMRegressor\n",
      "Fit fold 1 ...\n",
      "Fit fold 2 ...\n",
      "Fit fold 3 ...\n",
      "Fit fold 4 ...\n",
      "Fit fold 5 ...\n",
      "10 Base model: KNeighborsRegressor\n",
      "Fit fold 1 ...\n",
      "Fit fold 2 ...\n",
      "Fit fold 3 ...\n",
      "Fit fold 4 ...\n",
      "Fit fold 5 ...\n",
      "11 Base model: XGBRegressor\n",
      "Fit fold 1 ...\n",
      "Fit fold 2 ...\n",
      "Fit fold 3 ...\n",
      "Fit fold 4 ...\n",
      "Fit fold 5 ...\n",
      "12 Base model: GradientBoostingRegressor\n",
      "Fit fold 1 ...\n",
      "Fit fold 2 ...\n",
      "Fit fold 3 ...\n",
      "Fit fold 4 ...\n",
      "Fit fold 5 ...\n",
      "13 Base model: RandomForestRegressor\n",
      "Fit fold 1 ...\n",
      "Fit fold 2 ...\n",
      "Fit fold 3 ...\n",
      "Fit fold 4 ...\n",
      "Fit fold 5 ...\n",
      "14 Base model: LGBMRegressor\n",
      "Fit fold 1 ...\n",
      "Fit fold 2 ...\n",
      "Fit fold 3 ...\n",
      "Fit fold 4 ...\n",
      "Fit fold 5 ...\n",
      "15 Base model: KNeighborsRegressor\n",
      "Fit fold 1 ...\n",
      "Fit fold 2 ...\n",
      "Fit fold 3 ...\n",
      "Fit fold 4 ...\n",
      "Fit fold 5 ...\n",
      "CV MSE: [0.00838336 0.01917123 0.00950782 0.08465478 0.01305851]\n",
      "Stacker AVG MSE: 0.026955136851220928 Stacker AVG Score: 0.8755439613575262\n"
     ]
    }
   ],
   "source": [
    "pred_stack, S_train_data, S_predict_data = stacker.fit_predict(all_X_train, all_y_train, sub_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "stacking_model2 = SVR(C=100, gamma=0.01, epsilon=0.01)\n",
    "stacker2 = Stacker(5, stacking_model2, regrs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1 Base model: XGBRegressor\n",
      "Fit fold 1 ...\n",
      "Fit fold 2 ...\n",
      "Fit fold 3 ...\n",
      "Fit fold 4 ...\n",
      "Fit fold 5 ...\n",
      "2 Base model: GradientBoostingRegressor\n",
      "Fit fold 1 ...\n",
      "Fit fold 2 ...\n",
      "Fit fold 3 ...\n",
      "Fit fold 4 ...\n",
      "Fit fold 5 ...\n",
      "3 Base model: RandomForestRegressor\n",
      "Fit fold 1 ...\n",
      "Fit fold 2 ...\n",
      "Fit fold 3 ...\n",
      "Fit fold 4 ...\n",
      "Fit fold 5 ...\n",
      "4 Base model: LGBMRegressor\n",
      "Fit fold 1 ...\n",
      "Fit fold 2 ...\n",
      "Fit fold 3 ...\n",
      "Fit fold 4 ...\n",
      "Fit fold 5 ...\n",
      "5 Base model: KNeighborsRegressor\n",
      "Fit fold 1 ...\n",
      "Fit fold 2 ...\n",
      "Fit fold 3 ...\n",
      "Fit fold 4 ...\n",
      "Fit fold 5 ...\n",
      "6 Base model: XGBRegressor\n",
      "Fit fold 1 ...\n",
      "Fit fold 2 ...\n",
      "Fit fold 3 ...\n",
      "Fit fold 4 ...\n",
      "Fit fold 5 ...\n",
      "7 Base model: GradientBoostingRegressor\n",
      "Fit fold 1 ...\n",
      "Fit fold 2 ...\n",
      "Fit fold 3 ...\n",
      "Fit fold 4 ...\n",
      "Fit fold 5 ...\n",
      "8 Base model: RandomForestRegressor\n",
      "Fit fold 1 ...\n",
      "Fit fold 2 ...\n",
      "Fit fold 3 ...\n",
      "Fit fold 4 ...\n",
      "Fit fold 5 ...\n",
      "9 Base model: LGBMRegressor\n",
      "Fit fold 1 ...\n",
      "Fit fold 2 ...\n",
      "Fit fold 3 ...\n",
      "Fit fold 4 ...\n",
      "Fit fold 5 ...\n",
      "10 Base model: KNeighborsRegressor\n",
      "Fit fold 1 ...\n",
      "Fit fold 2 ...\n",
      "Fit fold 3 ...\n",
      "Fit fold 4 ...\n",
      "Fit fold 5 ...\n",
      "11 Base model: XGBRegressor\n",
      "Fit fold 1 ...\n",
      "Fit fold 2 ...\n",
      "Fit fold 3 ...\n",
      "Fit fold 4 ...\n",
      "Fit fold 5 ...\n",
      "12 Base model: GradientBoostingRegressor\n",
      "Fit fold 1 ...\n",
      "Fit fold 2 ...\n",
      "Fit fold 3 ...\n",
      "Fit fold 4 ...\n",
      "Fit fold 5 ...\n",
      "13 Base model: RandomForestRegressor\n",
      "Fit fold 1 ...\n",
      "Fit fold 2 ...\n",
      "Fit fold 3 ...\n",
      "Fit fold 4 ...\n",
      "Fit fold 5 ...\n",
      "14 Base model: LGBMRegressor\n",
      "Fit fold 1 ...\n",
      "Fit fold 2 ...\n",
      "Fit fold 3 ...\n",
      "Fit fold 4 ...\n",
      "Fit fold 5 ...\n",
      "15 Base model: KNeighborsRegressor\n",
      "Fit fold 1 ...\n",
      "Fit fold 2 ...\n",
      "Fit fold 3 ...\n",
      "Fit fold 4 ...\n",
      "Fit fold 5 ...\n",
      "CV MSE: [0.00896606 0.01730375 0.05738138 0.01084817 0.00934484]\n",
      "Stacker AVG MSE: 0.020768839888940183 Stacker AVG Score: 0.8843041207073812\n"
     ]
    }
   ],
   "source": [
    "pred_clean_stack, S_clean_train_data, S_clean_predict_data = stacker2.fit_predict(clean_X, clean_y, clean_sub_data)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Output"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {
    "collapsed": true,
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "df_result['score'] = pred_stack"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {
    "collapsed": true,
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "index = df_result[df_result['ID'].isin(special_missing_ID)].index\n",
    "df_result.loc[index, 'score'] = 0.379993053"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "c_index = df_result[df_result['ID'].isin(cleaned_sub_data_ID)].index\n",
    "df_result.loc[c_index, 'score'] = pred_clean_stack"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "df_result.to_csv('submit_stack_svm_15_poly_select_dropdup_outlier_clean.csv', index=False, header=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
